import pandas as pd
from nltk.tokenize import word_tokenize
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


# Text preprocessing function
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the shared Sastrawi (stop-word remover, stemmer) pair.

    The factories are comparatively expensive to build, so both objects are
    created on first use and reused for every subsequent review.
    """
    if not _TEXT_TOOLS:
        _TEXT_TOOLS['stopword_remover'] = (
            StopWordRemoverFactory().create_stop_word_remover()
        )
        _TEXT_TOOLS['stemmer'] = StemmerFactory().create_stemmer()
    return _TEXT_TOOLS['stopword_remover'], _TEXT_TOOLS['stemmer']


def preprocess_text(text):
    """Normalise one review into a space-separated string of stemmed words.

    Steps: tokenize with ``word_tokenize``, keep purely alphabetic tokens,
    lower-case them, drop stop words with the Sastrawi stop-word remover and
    stem what is left with the Sastrawi stemmer.

    Args:
        text: Raw review text.

    Returns:
        The surviving stems joined by single spaces (an empty string when
        nothing survives).
    """
    stopword_remover, stemmer = _get_text_tools()

    cleaned = []
    for word in word_tokenize(text):
        # Discard punctuation, numbers and mixed tokens.
        if not word.isalpha():
            continue
        # Lower-case before the stop-word lookup so capitalised stop words
        # (e.g. at the start of a sentence) are removed as well.
        word = stopword_remover.remove(word.lower())
        # The remover hands back an empty string for a stop word; skip it so
        # the result does not contain stray double spaces.
        if not word:
            continue
        stemmed = stemmer.stem(word)
        if stemmed:
            cleaned.append(stemmed)
    return ' '.join(cleaned)

# Sentiment analysis function
_LEXICON_CACHE = {}


def _load_lexicon(path):
    """Return the set of words listed one per line in *path*, read only once."""
    if path not in _LEXICON_CACHE:
        # The context manager guarantees the file handle is closed again.
        with open(path, "r", encoding="utf-8") as lexicon_file:
            _LEXICON_CACHE[path] = {
                line.strip() for line in lexicon_file if line.strip()
            }
    return _LEXICON_CACHE[path]


def analyze_sentiment(text):
    """Classify *text* by counting lexicon hits.

    Tokens of *text* are looked up in ``positive_words.txt`` and
    ``negative_words.txt`` (one word per line, read from the current working
    directory). The lexicons are loaded on the first call and cached, so
    applying this function to every row does not re-read the files.

    Args:
        text: Text to classify (the script passes preprocessed reviews).

    Returns:
        ``'positif'`` when positive hits outnumber negative ones,
        ``'negatif'`` for the opposite, and ``'netral'`` on a tie
        (including no hits at all).

    Raises:
        OSError: If a lexicon file cannot be opened.
    """
    positive_words = _load_lexicon("positive_words.txt")
    negative_words = _load_lexicon("negative_words.txt")

    tokens = word_tokenize(text)
    positive_count = sum(1 for word in tokens if word in positive_words)
    negative_count = sum(1 for word in tokens if word in negative_words)

    if positive_count > negative_count:
        return 'positif'
    if positive_count < negative_count:
        return 'negatif'
    return 'netral'
    
# Topic modeling function (LDA)
def perform_topic_modeling(data, n_topics=6, n_top_words=10):
    """Fit an LDA topic model and return the top words of every topic.

    Args:
        data: Iterable of documents (strings), e.g. a list or pandas Series.
        n_topics: Number of LDA topics to fit. Defaults to 6.
        n_top_words: Number of highest-weighted words to report per topic.
            Defaults to 10.

    Returns:
        A list with one entry per topic; each entry is a list of up to
        ``n_top_words`` vocabulary terms ordered from highest to lowest
        topic weight. An empty list is returned for empty input.

    Raises:
        ValueError: Propagated from ``CountVectorizer`` when the documents
            cannot produce a vocabulary under ``min_df=2`` / ``max_df=0.95``
            (for example when there are too few documents).
    """
    documents = list(data)
    if not documents:
        # Nothing to model; avoid the vectorizer's empty-input error.
        return []

    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words=None)
    tf = vectorizer.fit_transform(documents)

    lda_model = LatentDirichletAllocation(
        n_components=n_topics,
        learning_method='online',
        random_state=42,
        n_jobs=-1,
    )
    lda_model.fit(tf)

    features = vectorizer.get_feature_names_out()

    topics = []
    for topic in lda_model.components_:
        # argsort is ascending, so reverse it and keep the leading entries.
        top_features_indices = topic.argsort()[::-1][:n_top_words]
        topics.append([features[i] for i in top_features_indices])

    return topics

def plot_topic_modeling(topics, word_freq_dict, sentiment):
    """Draw a horizontal bar chart ranking topics by summed word frequency.

    Args:
        topics: List of topics, each a list of words.
        word_freq_dict: Mapping from word to its count. Words missing from
            the mapping contribute 0 instead of raising ``KeyError``.
        sentiment: Sentiment label shown (capitalised) in the chart title.

    Returns:
        None. The chart is displayed with ``plt.show()``.
    """
    topic_freq = {
        tuple(topic): sum(word_freq_dict.get(word, 0) for word in topic)
        for topic in topics
    }
    # Highest total first.
    ranked = sorted(topic_freq.items(), key=lambda item: item[1], reverse=True)

    labels = ['Topik: ' + ', '.join(topic) for topic, _ in ranked]
    counts = [count for _, count in ranked]

    plt.figure(figsize=(12, 8))
    plt.barh(labels, counts, color='orange')
    plt.xlabel('Jumlah Kata')
    plt.ylabel('Topik')
    plt.title(f'Topik yang Dibahas dalam Ulasan ({sentiment.capitalize()})')

    # barh draws the first entry at the bottom; invert so it is on top.
    plt.gca().invert_yaxis()
    plt.tight_layout()

    plt.show()


# Read the review data: every non-empty line of the file is one review
with open('google.csv', 'r', encoding='utf-8') as file:
    # Strip the line endings so they do not end up inside the review text
    # (they would otherwise be written back as embedded newlines in the
    # output CSV), and skip blank lines so they are not counted as reviews.
    data = [line.strip() for line in file if line.strip()]

# Build a DataFrame from the data
df = pd.DataFrame(data, columns=['Ulasan'])

# Run text preprocessing
df['Preprocessed_Ulasan'] = df['Ulasan'].apply(preprocess_text)

# Run sentiment analysis
df['Sentimen'] = df['Preprocessed_Ulasan'].apply(analyze_sentiment)
print("\nHasil Analisis Sentimen:\n", df[['Ulasan', 'Sentimen']])

# Numeric label for each sentiment class
df['Label_Sentimen'] = df['Sentimen'].map({'positif': 1, 'negatif': -1, 'netral': 0})

# Save the results
df.to_csv('google_labeled.csv', index=False, encoding='utf-8')

# Split the data
def split_data(df, test_size=0.2, random_state=42):
    """Return the test portion of a random train/test split of *df*.

    Args:
        df: Data to split.
        test_size: Passed to ``train_test_split`` as ``test_size``.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        The test part of the split; the train part is discarded.
    """
    parts = train_test_split(df, test_size=test_size, random_state=random_state)
    # For a single input the result is [train, test]; keep only the test part.
    return parts[1]

labeled_20persen_df = split_data(df, test_size=0.2)
labeled_20persen_df.to_csv('labeled_20persen.csv', index=False, encoding='utf-8')

# Word frequencies over the whole data set
word_freq_dict = df['Preprocessed_Ulasan'].str.split(expand=True).stack().value_counts().to_dict()

# Charts: fit one topic model and draw one bar chart per sentiment class.
# (A topic model over the whole data set was previously fitted here as well,
# but its result was overwritten before being used, so it is no longer run.)
for sentiment in ['positif', 'negatif', 'netral']:
    sentiment_df = df[df['Sentimen'] == sentiment]
    preprocessed_data = sentiment_df['Preprocessed_Ulasan'].tolist()

    if not preprocessed_data:
        print(f"Tidak ada ulasan dengan sentimen '{sentiment}'; grafik dilewati.")
        continue

    try:
        topics = perform_topic_modeling(preprocessed_data)
    except ValueError as err:
        # Raised by the vectorizer when the subset is too small or yields no
        # vocabulary; skip this chart instead of aborting the whole script.
        print(f"Topic modeling untuk sentimen '{sentiment}' dilewati: {err}")
        continue

    # Use the real corpus frequency of every topic word so the bar length
    # matches the chart's word-count axis. Previously the number of topics
    # containing each word was passed instead.
    topic_word_freq = {
        word: word_freq_dict.get(word, 0)
        for topic in topics
        for word in topic
    }

    plot_topic_modeling(topics, topic_word_freq, sentiment)


# Summary
# Count each sentiment class once and reuse the result.
sentiment_counts = df['Sentimen'].value_counts()
positif_count = sentiment_counts.get('positif', 0)
negatif_count = sentiment_counts.get('negatif', 0)
netral_count = sentiment_counts.get('netral', 0)

print("\nKesimpulan:")
print("Jumlah ulasan positif:", positif_count)
print("Jumlah ulasan negatif:", negatif_count)
print("Jumlah ulasan netral:", netral_count)

# NOTE(review): only the positive and negative counts are compared; the
# neutral count does not influence which message is printed, and a
# positive/negative tie is reported as neutral. Confirm this is intended.
if positif_count > negatif_count:
    print("Ulasan dalam dataset menunjukkan mayoritas sentimen POSITIF.")
elif positif_count < negatif_count:
    print("Ulasan dalam dataset menunjukkan mayoritas sentimen NEGATIF.")
else:
    print("Ulasan dalam dataset menunjukkan mayoritas sentimen NETRAL.")
